library(stringr)
library(feather)
library(koRpus)
library(tidyverse)
library(lme4)
library(dplyr)
library(sjPlot)
library(corrplot)
library(tidytext)
library(tm)
library(childesr)
# Load data
# CHILDES is restricted to 14-58 months so that its age range lines up with
# that of LDP.
childes_all <- feather::read_feather(
  path = "/Users/Yawen/Desktop/lexical diversity/triaL6_childes/childes_all.feather"
) %>%
  filter(age >= 14, age <= 58)

ldp_all <- feather::read_feather(
  path = "/Users/Yawen/Desktop/lexical diversity/trial5_ldp/ldp_all.feather"
)

# Plot Growth Curve
# CHILDES: one smoothed curve per child measure against age. Each measure is
# passed through scale() inside aes() so that all five share one y axis.
# `x = age` is mapped once in ggplot() and inherited by every layer.
childes_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = scale(kid_ttr), color = "ttr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mattr), color = "mattr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_vocd), color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mtld), color = "mtld"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mlu), color = "mlu"), se = FALSE) +
  theme_classic() +
  labs(title = "CHILDES: Growth Curve of Lexical Diversity",
       subtitle = "14 ~ 58 Months",
       y = "lexical diversity (scaled)")

# facet by UK/US group
# Same five scaled curves as the overall CHILDES plot, drawn in one panel per
# value of `group`. `x = age` is mapped once and inherited by every layer.
childes_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = scale(kid_ttr), color = "ttr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mattr), color = "mattr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_vocd), color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mtld), color = "mtld"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mlu), color = "mlu"), se = FALSE) +
  facet_grid(~group) +
  theme_classic() +
  labs(title = "CHILDES: Growth Curve of Lexical Diversity",
       subtitle = "14 ~ 58 Months",
       y = "lexical diversity (scaled)")

# compare with CDI
# LDP: scaled lexical diversity indices plotted next to the scaled CDI score.
# The former group_by(subject) step was dropped: none of the layers map a
# per-subject grouping, so it had no effect on the plot.
# NOTE(review): the subtitle says 18 ~ 30 months but xlim() starts at 14 --
# confirm which lower bound is intended.
ldp_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = scale(kid_ttr), color = "ttr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mattr), color = "mattr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_vocd), color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mtld), color = "mtld"), se = FALSE) +
  geom_smooth(aes(y = scale(cdi), color = "CDI"), se = FALSE) +
  xlim(14, 30) +
  theme_classic() +
  labs(title = "Compare Lexical Diversity Indices with CDI",
       subtitle = "LDP: 18 ~ 30 Months",
       y = "lexical diversity (scaled)")

# compare with PPVT
# LDP: scaled lexical diversity indices plotted next to the scaled PPVT score,
# with the x axis limited to 30-53 months.
# The former group_by(subject) step was dropped: none of the layers map a
# per-subject grouping, so it had no effect on the plot.
ldp_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = scale(kid_ttr), color = "ttr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mattr), color = "mattr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_vocd), color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mtld), color = "mtld"), se = FALSE) +
  geom_smooth(aes(y = scale(ppvt), color = "PPVT"), se = FALSE) +
  xlim(30, 53) +
  theme_classic() +
  labs(title = "Compare Lexical Diversity Indices with PPVT",
       subtitle = "LDP: 30 ~ 53 Months",
       y = "lexical diversity (scaled)")

# TTR vs MATTR
# Scaled child TTR and MATTR curves over age, first for CHILDES and then for
# LDP.
childes_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = scale(kid_ttr), color = "ttr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mattr), color = "mattr"), se = FALSE) +
  theme_classic() +
  labs(title = "Growth Curve by TTR & MATTR",
       subtitle = "CHILDES: 14 ~ 58 Months",
       y = "lexical diversity (scaled)")

ldp_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = scale(kid_ttr), color = "ttr"), se = FALSE) +
  geom_smooth(aes(y = scale(kid_mattr), color = "mattr"), se = FALSE) +
  theme_classic() +
  labs(title = "Children's Growth Curve by TTR & MATTR",
       subtitle = "LDP: 14 ~ 58 Months",
       y = "lexical diversity (scaled)")

# MTLD vs vocd-D (Kid)
# Child vocd-D and MTLD curves over age, first for CHILDES and then for LDP.
# These layers plot the raw columns (no scale() call), so the y label no
# longer claims the values are scaled.
childes_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = kid_vocd, color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = kid_mtld, color = "mtld"), se = FALSE) +
  theme_classic() +
  labs(title = "Children's Growth Curve by MTLD & vocd-D",
       subtitle = "CHILDES: 14 ~ 58 Months",
       y = "lexical diversity (unscaled)")

ldp_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = kid_vocd, color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = kid_mtld, color = "mtld"), se = FALSE) +
  theme_classic() +
  labs(title = "Children's Growth Curve by MTLD & vocd-D",
       subtitle = "LDP: 14 ~ 58 Months",
       y = "lexical diversity (unscaled)")

# MTLD vs vocd-D (Mother)
# Mother vocd-D and MTLD curves over `age`, first for CHILDES and then for
# LDP. These layers plot the raw columns (no scale() call), so the y label no
# longer claims the values are scaled.
childes_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = mom_vocd, color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = mom_mtld, color = "mtld"), se = FALSE) +
  theme_classic() +
  labs(title = "Mother's Growth Curve by MTLD & vocd-D",
       subtitle = "CHILDES: 14 ~ 58 Months",
       y = "lexical diversity (unscaled)")

ldp_all %>%
  ggplot(aes(x = age)) +
  geom_smooth(aes(y = mom_vocd, color = "vocd"), se = FALSE) +
  geom_smooth(aes(y = mom_mtld, color = "mtld"), se = FALSE) +
  theme_classic() +
  labs(title = "Mother's Growth Curve by MTLD & vocd-D",
       subtitle = "LDP: 14 ~ 58 Months",
       y = "lexical diversity (unscaled)")

# Compare by Variance of Parameters ----

# Load the intercept/slope parameter tables for CHILDES and LDP.
# NOTE(review): both files are read from the trial5_ldp directory -- confirm
# that this is where the CHILDES table is meant to live.
childes_intercept <- feather::read_feather(
  path = "/Users/Yawen/Desktop/lexical diversity/trial5_ldp/childes_intercept.feather"
)
ldp_intercept <- feather::read_feather(
  path = "/Users/Yawen/Desktop/lexical diversity/trial5_ldp/ldp_intercept.feather"
)

# variance of children's intercept
# LDP: mean, SD and coefficient of variation (sd / mean) of every child
# intercept column, with missing values ignored.
ldp_intercept %>%
  ungroup() %>%
  select(
    cdi_intercept, ppvt_intercept, mtld_intercept,
    mattr_intercept, vocd_intercept, ttr_intercept,
    mlu_intercept, sen_intercept
  ) %>%
  gather(key = "measure", value = "value") %>%
  group_by(measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    coef_of_var = sd / mean
  )
## # A tibble: 8 x 4
##           measure        mean           sd coef_of_var
##             <chr>       <dbl>        <dbl>       <dbl>
## 1   cdi_intercept 497.5050370 152.25178818  0.30603065
## 2 mattr_intercept   0.4157072   0.04125706  0.09924548
## 3   mlu_intercept   2.3530655   0.35607149  0.15132239
## 4  mtld_intercept  12.6276056   2.44016389  0.19324043
## 5  ppvt_intercept  27.4537313  10.91072920  0.39742245
## 6   sen_intercept  19.2577319  11.35482955  0.58962445
## 7   ttr_intercept   0.1934905   0.02762355  0.14276435
## 8  vocd_intercept  29.1841902   1.77017781  0.06065537
# CHILDES: mean, SD and coefficient of variation (sd / mean) of every child
# intercept column, with missing values ignored.
childes_intercept %>%
  ungroup() %>%
  select(
    mtld_intercept, mattr_intercept, vocd_intercept,
    ttr_intercept, mlu_intercept
  ) %>%
  gather(key = "measure", value = "value") %>%
  group_by(measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    coef_of_var = sd / mean
  )
## # A tibble: 5 x 4
##           measure       mean         sd coef_of_var
##             <chr>      <dbl>      <dbl>       <dbl>
## 1 mattr_intercept  0.5575220 0.04228984  0.07585322
## 2   mlu_intercept  3.4293445 0.70482848  0.20552863
## 3  mtld_intercept 15.8337543 3.27785922  0.20701718
## 4   ttr_intercept  0.2347557 0.10931233  0.46564283
## 5  vocd_intercept 30.6584564 2.19072708  0.07145588
# Histograms of the children's intercepts, one panel per measure with its own
# x range. The LDP title used to read "coef_of_var of ..." although the plot
# shows the raw intercept values; it now matches the sibling histograms.
ldp_intercept %>%
  gather(measure, value,
         cdi_intercept, ppvt_intercept, mtld_intercept,
         mattr_intercept, vocd_intercept, ttr_intercept,
         mlu_intercept, sen_intercept) %>%
  ggplot(aes(x = value)) +
  facet_grid(~ measure, scales = "free_x") +
  geom_histogram() +
  theme_classic() +
  labs(title = "Variance of Children's Intercept",
       subtitle = "LDP: 14 ~ 58 Months")

childes_intercept %>%
  ungroup() %>%
  gather(measure, value,
         mtld_intercept, mattr_intercept, vocd_intercept,
         ttr_intercept, mlu_intercept) %>%
  ggplot(aes(x = value)) +
  facet_grid(~ measure, scales = "free_x") +
  geom_histogram() +
  theme_classic() +
  labs(title = "Variance of Children's Intercept",
       subtitle = "CHILDES: 14 ~ 58 Months")

# variance of children's slope
# LDP: mean, SD and coefficient of variation (sd / mean) of every child slope
# column, with missing values ignored.
ldp_intercept %>%
  ungroup() %>%
  select(
    cdi_slope, ppvt_slope, mtld_slope, mattr_slope,
    vocd_slope, ttr_slope, mlu_slope, sen_slope
  ) %>%
  gather(key = "measure", value = "value") %>%
  group_by(measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    coef_of_var = sd / mean
  )
## # A tibble: 8 x 4
##       measure         mean           sd coef_of_var
##         <chr>        <dbl>        <dbl>       <dbl>
## 1   cdi_slope 836.30454069 131.54764297  0.15729634
## 2 mattr_slope   0.22790449   0.05044707  0.22135179
## 3   mlu_slope   2.34837035   0.22141643  0.09428514
## 4  mtld_slope  18.03235698   2.86834299  0.15906645
## 5  ppvt_slope  77.49354350  18.14881161  0.23419773
## 6   sen_slope  39.04148776  19.12625515  0.48989565
## 7   ttr_slope  -0.02842861   0.04605768 -1.62011720
## 8  vocd_slope  10.33666075   2.91275673  0.28178895
# CHILDES: mean, SD and coefficient of variation (sd / mean) of every child
# slope column, with missing values ignored.
childes_intercept %>%
  ungroup() %>%
  select(mtld_slope, mattr_slope, vocd_slope, ttr_slope, mlu_slope) %>%
  gather(key = "measure", value = "value") %>%
  group_by(measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    coef_of_var = sd / mean
  )
## # A tibble: 5 x 4
##       measure        mean         sd coef_of_var
##         <chr>       <dbl>      <dbl>       <dbl>
## 1 mattr_slope  0.19600032 0.09697797  0.49478477
## 2   mlu_slope  1.22627619 0.09686533  0.07899144
## 3  mtld_slope 18.97232656 4.91981046  0.25931508
## 4   ttr_slope -0.08196691 0.09686533 -1.18176134
## 5  vocd_slope  7.46758510 6.60336456  0.88427041
# Histograms of the children's slopes, one panel per measure, each panel with
# its own x range. LDP first, then CHILDES.
ldp_intercept %>%
  gather(
    key = "measure", value = "value",
    cdi_slope, ppvt_slope, mtld_slope, mattr_slope,
    vocd_slope, ttr_slope, mlu_slope, sen_slope
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ measure, scales = "free_x") +
  labs(
    title = "Variance of Children's Slope",
    subtitle = "LDP: 14 ~ 58 Months"
  ) +
  theme_classic()

childes_intercept %>%
  ungroup() %>%
  gather(
    key = "measure", value = "value",
    mtld_slope, mattr_slope, vocd_slope, ttr_slope, mlu_slope
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ measure, scales = "free_x") +
  labs(
    title = "Variance of Children's Slope",
    subtitle = "CHILDES: 14 ~ 58 Months"
  ) +
  theme_classic()

# variance of mother's intercept
# LDP: mean, SD and coefficient of variation (sd / mean) of every mother
# intercept column, with missing values ignored.
ldp_intercept %>%
  ungroup() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept,
    mom_mlu_intercept
  ) %>%
  gather(key = "measure", value = "value") %>%
  group_by(measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    coef_of_var = sd / mean
  )
## # A tibble: 5 x 4
##               measure       mean         sd coef_of_var
##                 <chr>      <dbl>      <dbl>       <dbl>
## 1 mom_mattr_intercept  0.5634104 0.02346517  0.04164845
## 2   mom_mlu_intercept  4.1064351 0.40644597  0.09897782
## 3  mom_mtld_intercept 31.4707075 5.16404671  0.16409058
## 4   mom_ttr_intercept  0.1588064 0.04021612  0.25323995
## 5  mom_vocd_intercept 34.2712969 0.54257524  0.01583177
# CHILDES: mean, SD and coefficient of variation (sd / mean) of every mother
# intercept column, with missing values ignored.
childes_intercept %>%
  ungroup() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept, mom_mlu_intercept
  ) %>%
  gather(key = "measure", value = "value") %>%
  group_by(measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    coef_of_var = sd / mean
  )
## # A tibble: 5 x 4
##               measure       mean        sd coef_of_var
##                 <chr>      <dbl>     <dbl>       <dbl>
## 1 mom_mattr_intercept  0.6784563 0.0250928  0.03698514
## 2   mom_mlu_intercept  4.2069000 0.8785460  0.20883453
## 3  mom_mtld_intercept 34.0199253 5.2203424  0.15344956
## 4   mom_ttr_intercept  0.2655001 0.1355026  0.51036743
## 5  mom_vocd_intercept 32.9539727 0.3893581  0.01181521
# Histograms of the mothers' intercepts, one panel per measure, each panel
# with its own x range. LDP first, then CHILDES.
ldp_intercept %>%
  gather(
    key = "measure", value = "value",
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept,
    mom_mlu_intercept
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ measure, scales = "free_x") +
  labs(
    title = "Variance of Mother's Intercept",
    subtitle = "LDP: 14 ~ 58 Months"
  ) +
  theme_classic()

childes_intercept %>%
  gather(
    key = "measure", value = "value",
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept, mom_mlu_intercept
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ measure, scales = "free_x") +
  labs(
    title = "Variance of Mother's Intercept",
    subtitle = "CHILDES: 14 ~ 58 Months"
  ) +
  theme_classic()

# variance of mother's slope
# LDP: mean, SD and coefficient of variation (sd / mean) of every mother
# slope column, with missing values ignored.
# Fix: the last gathered column used to be `mom_mlu_intercept`, so the slope
# table reported an intercept; it now gathers `mom_mlu_slope`.
ldp_intercept %>%
  ungroup() %>%
  gather(measure, value,
         mom_mtld_slope, mom_mattr_slope,
         mom_vocd_slope, mom_ttr_slope, mom_mlu_slope) %>%
  group_by(measure) %>%
  summarise(mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            coef_of_var = sd / mean)
# NOTE: the output pasted below was produced before this fix (row 2 is still
# mom_mlu_intercept); re-run the summary to refresh it with mom_mlu_slope.
## # A tibble: 5 x 4
##             measure        mean         sd coef_of_var
##               <chr>       <dbl>      <dbl>       <dbl>
## 1   mom_mattr_slope  0.06334250 0.01283598  0.20264410
## 2 mom_mlu_intercept  4.10643507 0.40644597  0.09897782
## 3    mom_mtld_slope 14.80122629 2.57400445  0.17390481
## 4     mom_ttr_slope  0.04688715 0.05098465  1.08739070
## 5    mom_vocd_slope  0.56419163 0.23421548  0.41513463
# CHILDES: mean, SD and coefficient of variation (sd / mean) of every mother
# slope column, with missing values ignored.
childes_intercept %>%
  ungroup() %>%
  select(
    mom_mtld_slope, mom_mattr_slope, mom_vocd_slope,
    mom_ttr_slope, mom_mlu_slope
  ) %>%
  gather(key = "measure", value = "value") %>%
  group_by(measure) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    coef_of_var = sd / mean
  )
## # A tibble: 5 x 4
##           measure        mean         sd coef_of_var
##             <chr>       <dbl>      <dbl>       <dbl>
## 1 mom_mattr_slope  0.05056424 0.03349923   0.6625082
## 2   mom_mlu_slope  0.58665441 0.58185294   0.9918155
## 3  mom_mtld_slope 10.82447528 5.96172968   0.5507638
## 4   mom_ttr_slope  0.03423628 0.04476633   1.3075696
## 5  mom_vocd_slope  0.67090861 0.80378108   1.1980485
# Histograms of the mothers' slopes, one panel per measure, each panel with
# its own x range. LDP first, then CHILDES.
ldp_intercept %>%
  gather(
    key = "measure", value = "value",
    mom_mtld_slope, mom_mattr_slope, mom_vocd_slope,
    mom_ttr_slope, mom_mlu_slope
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ measure, scales = "free_x") +
  labs(
    title = "Variance of Mother's Slope",
    subtitle = "LDP: 14 ~ 58 Months"
  ) +
  theme_classic()

childes_intercept %>%
  gather(
    key = "measure", value = "value",
    mom_mtld_slope, mom_mattr_slope, mom_vocd_slope,
    mom_ttr_slope, mom_mlu_slope
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_grid(. ~ measure, scales = "free_x") +
  labs(
    title = "Variance of Mother's Slope",
    subtitle = "CHILDES: 14 ~ 58 Months"
  ) +
  theme_classic()

# Compare by Correlation among Parameters ----

# correlation plot of child's intercept
# Rows with a missing value in ANY column of the table are dropped before the
# intercept columns are selected and correlated. LDP is drawn as squares,
# CHILDES as numbers.
ldp_intercept %>%
  ungroup() %>%
  drop_na() %>%
  select(
    cdi_intercept, ppvt_intercept, mtld_intercept,
    mattr_intercept, vocd_intercept, ttr_intercept,
    mlu_intercept, sen_intercept
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")

childes_intercept %>%
  ungroup() %>%
  drop_na() %>%
  select(
    mtld_intercept, mattr_intercept, vocd_intercept,
    ttr_intercept, mlu_intercept
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "number", type = "upper")

# correlation plot of mother's intercept
# Rows with a missing value in ANY column of the table are dropped before the
# mother intercept columns are selected and correlated. LDP first, then
# CHILDES.
ldp_intercept %>%
  ungroup() %>%
  drop_na() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept,
    mom_mlu_intercept
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "number", type = "upper")

childes_intercept %>%
  ungroup() %>%
  drop_na() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept,
    mom_mlu_intercept
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "number", type = "upper")

# correlation plot of child's slope
# Rows with a missing value in ANY column of the table are dropped before the
# child slope columns are selected and correlated. LDP is drawn as squares,
# CHILDES as numbers.
ldp_intercept %>%
  drop_na() %>%
  select(
    cdi_slope, ppvt_slope, mtld_slope, mattr_slope,
    vocd_slope, ttr_slope, sen_slope, mlu_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")

childes_intercept %>%
  drop_na() %>%
  select(
    mtld_slope, mattr_slope,
    vocd_slope, ttr_slope, mlu_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "number", type = "upper")

# correlation plot of mother's slope
# Rows with a missing value in ANY column of the table are dropped before the
# mother slope columns are selected and correlated.
# Fix: the LDP selection used to leave out `mom_mlu_slope`, unlike the CHILDES
# plot below and the LDP all-parameter plots; it is now included so the two
# corpora show the same five measures.
ldp_intercept %>%
  ungroup() %>%
  filter(complete.cases(.)) %>%
  select(mom_mtld_slope, mom_mattr_slope,
         mom_vocd_slope, mom_ttr_slope, mom_mlu_slope) %>%
  cor() %>%
  corrplot::corrplot(method = "number", type = "upper")

childes_intercept %>%
  ungroup() %>%
  filter(complete.cases(.)) %>%
  select(mom_mtld_slope, mom_mattr_slope,
         mom_vocd_slope, mom_ttr_slope, mom_mlu_slope) %>%
  cor() %>%
  corrplot::corrplot(method = "number", type = "upper")

# plot all parameters of children
# Correlation matrix over every child intercept and slope column, computed on
# rows that have no missing value in any column of the table. LDP first, then
# CHILDES.
ldp_intercept %>%
  drop_na() %>%
  select(
    cdi_intercept, ppvt_intercept, mtld_intercept, mattr_intercept,
    vocd_intercept, ttr_intercept, mlu_intercept, sen_intercept,
    cdi_slope, ppvt_slope, mtld_slope, mattr_slope, vocd_slope,
    ttr_slope, mlu_slope, sen_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")

childes_intercept %>%
  drop_na() %>%
  select(
    mtld_intercept, mattr_intercept, vocd_intercept, ttr_intercept,
    mtld_slope, mattr_slope, vocd_slope, ttr_slope,
    mlu_intercept, mlu_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")

# plot all parameters of mothers
# Correlation matrix over every mother intercept and slope column, computed on
# rows that have no missing value in any column of the table. LDP first, then
# CHILDES.
ldp_intercept %>%
  drop_na() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept, mom_vocd_intercept,
    mom_ttr_intercept, mom_mlu_intercept, mom_mtld_slope,
    mom_mattr_slope, mom_vocd_slope, mom_ttr_slope, mom_mlu_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")

childes_intercept %>%
  drop_na() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept,
    mom_mtld_slope, mom_mattr_slope,
    mom_vocd_slope, mom_ttr_slope,
    mom_mlu_intercept, mom_mlu_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")

# plot parameters of child and mother
# Joint correlation matrix of mother and child intercepts/slopes, computed on
# rows that have no missing value in any column of the table. LDP first, then
# CHILDES. The column order below fixes the row/column order of the plot.
ldp_intercept %>%
  drop_na() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept,
    mom_mlu_intercept, mom_mtld_slope, mom_mattr_slope,
    mom_vocd_slope, mom_ttr_slope, mom_mlu_slope,
    mtld_intercept, mtld_slope,
    mattr_intercept, mattr_slope,
    vocd_intercept, vocd_slope,
    ttr_intercept, ttr_slope,
    mlu_intercept, mlu_slope,
    sen_intercept, sen_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")

childes_intercept %>%
  drop_na() %>%
  select(
    mom_mtld_intercept, mom_mattr_intercept,
    mom_vocd_intercept, mom_ttr_intercept,
    mom_mtld_slope, mom_mattr_slope,
    mom_vocd_slope, mom_ttr_slope,
    mom_mlu_intercept, mom_mlu_slope,
    mtld_intercept, mtld_slope,
    mattr_intercept, mattr_slope,
    vocd_intercept, vocd_slope,
    ttr_intercept, ttr_slope,
    mlu_intercept, mlu_slope
  ) %>%
  cor() %>%
  corrplot::corrplot(method = "square", type = "upper")